At AllLife Bank, a model has been built to help the marketing department identify potential customers who have a higher probability of purchasing a loan.
import pandas as pd #importing Pandas for dataset import
import matplotlib.pyplot as plt #importing for basic visualization and seaborn
import seaborn as sns #importing for better visualization features
import warnings # importing to suppress the warnings of deprecated features
import numpy as np #importing numpy
warnings.filterwarnings('ignore') # suppress the warnings
#for inline plotting
%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.5f' % x) #to suppress scientific notations
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# To build linear model for prediction
from sklearn.linear_model import LinearRegression
# To check model performance
from sklearn.model_selection import train_test_split
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# Libraries to build decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# To tune different models
from sklearn.model_selection import GridSearchCV
# To get diferent metric scores
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
plot_confusion_matrix,
make_scorer,
)
# Import the dataset stored in a CSV file.
# NOTE(review): the original comment claimed the 1st column is set as the index,
# but no index_col is passed — ID stays an ordinary column (and later appears
# among the model features); confirm whether that is intended.
df = pd.read_csv(r'C:\Users\user\Desktop\AI_ML_Austin\notebook prog files\ProjectWork3\Loan_Modelling.csv')
# creating a copy of the data so that original data remains unchanged
loandf1 = df.copy()
loandf1.head(10) #display top 10 rows
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1.60000 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.50000 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.00000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.70000 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.00000 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
| 5 | 6 | 37 | 13 | 29 | 92121 | 4 | 0.40000 | 2 | 155 | 0 | 0 | 0 | 1 | 0 |
| 6 | 7 | 53 | 27 | 72 | 91711 | 2 | 1.50000 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 7 | 8 | 50 | 24 | 22 | 93943 | 1 | 0.30000 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 8 | 9 | 35 | 10 | 81 | 90089 | 3 | 0.60000 | 2 | 104 | 0 | 0 | 0 | 1 | 0 |
| 9 | 10 | 34 | 9 | 180 | 93023 | 1 | 8.90000 | 3 | 0 | 1 | 0 | 0 | 0 | 0 |
It's always better to check random rows instead of only the top rows.
# Seed NumPy's global RNG so the sample below returns the same rows every run.
np.random.seed(1) #random set with seed 1, so the same set of random values can be generated everytime
loandf1.sample(n=20) #Print 20 random rows to check the dataset
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2764 | 2765 | 31 | 5 | 84 | 91320 | 1 | 2.90000 | 3 | 105 | 0 | 0 | 0 | 0 | 1 |
| 4767 | 4768 | 35 | 9 | 45 | 90639 | 3 | 0.90000 | 1 | 101 | 0 | 1 | 0 | 0 | 0 |
| 3814 | 3815 | 34 | 9 | 35 | 94304 | 3 | 1.30000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3499 | 3500 | 49 | 23 | 114 | 94550 | 1 | 0.30000 | 1 | 286 | 0 | 0 | 0 | 1 | 0 |
| 2735 | 2736 | 36 | 12 | 70 | 92131 | 3 | 2.60000 | 2 | 165 | 0 | 0 | 0 | 1 | 0 |
| 3922 | 3923 | 31 | 4 | 20 | 95616 | 4 | 1.50000 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2701 | 2702 | 50 | 26 | 55 | 94305 | 1 | 1.60000 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1179 | 1180 | 36 | 11 | 98 | 90291 | 3 | 1.20000 | 3 | 0 | 0 | 1 | 0 | 0 | 1 |
| 932 | 933 | 51 | 27 | 112 | 94720 | 3 | 1.80000 | 2 | 0 | 0 | 1 | 1 | 1 | 1 |
| 792 | 793 | 41 | 16 | 98 | 93117 | 1 | 4.00000 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1852 | 1853 | 32 | 6 | 54 | 94596 | 4 | 1.80000 | 3 | 167 | 0 | 0 | 0 | 0 | 0 |
| 1185 | 1186 | 43 | 19 | 31 | 94025 | 3 | 0.50000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1724 | 1725 | 46 | 19 | 24 | 90025 | 3 | 0.67000 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4080 | 4081 | 27 | 0 | 40 | 90068 | 1 | 2.00000 | 2 | 110 | 0 | 0 | 0 | 0 | 1 |
| 3823 | 3824 | 49 | 25 | 44 | 94708 | 4 | 0.90000 | 2 | 194 | 0 | 0 | 0 | 1 | 0 |
| 4054 | 4055 | 59 | 34 | 64 | 94116 | 4 | 1.70000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2721 | 2722 | 58 | 33 | 173 | 92121 | 2 | 7.20000 | 3 | 0 | 1 | 0 | 0 | 1 | 0 |
| 3903 | 3904 | 47 | 23 | 65 | 93943 | 1 | 0.00000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1865 | 1866 | 36 | 6 | 90 | 91342 | 4 | 1.80000 | 3 | 0 | 0 | 1 | 0 | 0 | 0 |
| 759 | 760 | 53 | 28 | 59 | 91950 | 2 | 1.90000 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
The dataset looks consistent with the description provided in the Data Dictionary. Lots of data have values which need to be processed so the data can be effectively used for analysis
# Report dataset dimensions from DataFrame.shape = (rows, columns).
print(f'There are {loandf1.shape[0]} rows and {loandf1.shape[1]} columns.') # f-string
There are 5000 rows and 14 columns.
There are 5000 rows/observations of 14 columns
# Count how many columns share each dtype (all numeric: int64 and float64).
loandf1.dtypes.value_counts() #Gives count of the different datatype in dataset
int64 13 float64 1 dtype: int64
# Column dtypes, non-null counts and memory usage — confirms no missing values.
loandf1.info() #display complete info
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 Experience 5000 non-null int64 3 Income 5000 non-null int64 4 ZIPCode 5000 non-null int64 5 Family 5000 non-null int64 6 CCAvg 5000 non-null float64 7 Education 5000 non-null int64 8 Mortgage 5000 non-null int64 9 Personal_Loan 5000 non-null int64 10 Securities_Account 5000 non-null int64 11 CD_Account 5000 non-null int64 12 Online 5000 non-null int64 13 CreditCard 5000 non-null int64 dtypes: float64(1), int64(13) memory usage: 547.0 KB
# Summary statistics for every column; transposed so each column is a row.
loandf1.describe(include="all").T #describe all dataset. Use transpose so column wise details can be view from top to bottom
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 5000.00000 | 2500.50000 | 1443.52000 | 1.00000 | 1250.75000 | 2500.50000 | 3750.25000 | 5000.00000 |
| Age | 5000.00000 | 45.33840 | 11.46317 | 23.00000 | 35.00000 | 45.00000 | 55.00000 | 67.00000 |
| Experience | 5000.00000 | 20.10460 | 11.46795 | -3.00000 | 10.00000 | 20.00000 | 30.00000 | 43.00000 |
| Income | 5000.00000 | 73.77420 | 46.03373 | 8.00000 | 39.00000 | 64.00000 | 98.00000 | 224.00000 |
| ZIPCode | 5000.00000 | 93169.25700 | 1759.45509 | 90005.00000 | 91911.00000 | 93437.00000 | 94608.00000 | 96651.00000 |
| Family | 5000.00000 | 2.39640 | 1.14766 | 1.00000 | 1.00000 | 2.00000 | 3.00000 | 4.00000 |
| CCAvg | 5000.00000 | 1.93794 | 1.74766 | 0.00000 | 0.70000 | 1.50000 | 2.50000 | 10.00000 |
| Education | 5000.00000 | 1.88100 | 0.83987 | 1.00000 | 1.00000 | 2.00000 | 3.00000 | 3.00000 |
| Mortgage | 5000.00000 | 56.49880 | 101.71380 | 0.00000 | 0.00000 | 0.00000 | 101.00000 | 635.00000 |
| Personal_Loan | 5000.00000 | 0.09600 | 0.29462 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 1.00000 |
| Securities_Account | 5000.00000 | 0.10440 | 0.30581 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 1.00000 |
| CD_Account | 5000.00000 | 0.06040 | 0.23825 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 1.00000 |
| Online | 5000.00000 | 0.59680 | 0.49059 | 0.00000 | 0.00000 | 1.00000 | 1.00000 | 1.00000 |
| CreditCard | 5000.00000 | 0.29400 | 0.45564 | 0.00000 | 0.00000 | 0.00000 | 1.00000 | 1.00000 |
plt.figure(figsize=(20,8))
# Letter-value (boxen) plot of Age — like a boxplot but with extra quantile boxes.
sns.boxenplot(x=loandf1["Age"]); #using letter-value plot (boxenplot)
Most of the customers have age between 35 to 55 years
plt.figure(figsize=(20,8))
# Box plot of Income: median/IQR plus right-tail outliers.
sns.boxplot(x=loandf1["Income"]); #using box plot
Highest number of customers have income between 40K to 100K
plt.figure(figsize=(20,8))
# Count of customers per Experience value. Pass the series as a keyword:
# positional data args were deprecated in seaborn 0.12 and removed in 0.13.
sns.countplot(x=loandf1['Experience']);
Experience ranges from -3 to 43 years. We need to identify the records having negative values and process them: negative experience is not an expected value and may be the result of incorrect data extraction. Negative experience needs to be changed to 0.
plt.figure(figsize=(20,8))
# Count of customers per Family size (keyword arg for seaborn >= 0.13 compatibility).
sns.countplot(x=loandf1['Family']);
The family size is evenly distributed. All the families in the data have less than or equal to 4 members.
plt.figure(figsize=(20,8))
# Box plot of average monthly credit-card spend (CCAvg, in $000s per the data dictionary — TODO confirm units).
sns.boxplot(x=loandf1["CCAvg"]); #using box plot
Most of the customers spend between .5K to 2.5K using credit card
plt.figure(figsize=(20,8))
# Count of customers per Education level (keyword arg for seaborn >= 0.13 compatibility).
sns.countplot(x=loandf1['Education']);
Highest number of customers are undergrad followed by Graduates and advanced/professionals.
plt.figure(figsize=(20,8))
# Box plot of Mortgage value; most rows are 0 (no mortgage), with a long right tail.
sns.boxplot(x=loandf1["Mortgage"]); #using box plot
Value of house Mortgage range from 0 to almost 600K. Most of the mortgages are less than 250K.
plt.figure(figsize=(20,8))
# Class balance of the target (keyword arg for seaborn >= 0.13 compatibility).
sns.countplot(x=loandf1['Personal_Loan']);
Fewer than 500 customers (about 9.6%) have opted for a personal loan; more than 4.5K have not.
plt.figure(figsize=(20,8))
# Count of Securities_Account holders (keyword arg for seaborn >= 0.13 compatibility).
sns.countplot(x=loandf1['Securities_Account']);
More than 4.5K customers do not have a securities account (only about 10% do).
plt.figure(figsize=(20,8))
# Count of CD_Account holders (keyword arg for seaborn >= 0.13 compatibility).
sns.countplot(x=loandf1['CD_Account']);
Only about 300 customers (about 6%) have a certificate of deposit (CD) account with the bank.
plt.figure(figsize=(20,8))
# Count of Online-banking users (keyword arg for seaborn >= 0.13 compatibility).
sns.countplot(x=loandf1['Online']);
2K customers (40%) don't have internet banking facility.
plt.figure(figsize=(20,8))
# Count of CreditCard holders (keyword arg for seaborn >= 0.13 compatibility).
sns.countplot(x=loandf1['CreditCard']);
About 1.5K customers (roughly 29%) have credit cards issued by the bank; about 3.5K do not.
# Pairwise scatter/KDE grid of all numeric columns, colored by the target class.
sns.pairplot(loandf1, hue="Personal_Loan"); #pair plot multiple pair wise bivariate distributions
The pair plot between the values in dataset show the overall distribution against each of the columns. This give an overall view against the selected dataset values.
plt.figure(figsize=(20,8))
# Bar height = mean Personal_Loan (loan take-up rate) per Age; error bars = CI.
sns.barplot(x='Age', y='Personal_Loan', data=loandf1) #Analysis with the target variable (Personal_Loan)
<AxesSubplot:xlabel='Age', ylabel='Personal_Loan'>
Customers <= 25 years and >=65 don't have personal loans
plt.figure(figsize=(20,8))
# Mean loan take-up rate per years of Experience.
sns.barplot(x='Experience', y='Personal_Loan', data=loandf1) #Analysis with the target variable (Personal_Loan)
<AxesSubplot:xlabel='Experience', ylabel='Personal_Loan'>
Customers with more than 41 years of experience don't have personal loans.
plt.figure(figsize=(20,8))
# Mean loan take-up rate per Income value.
sns.barplot(x='Income', y='Personal_Loan', data=loandf1) #Analysis with the target variable (Personal_Loan)
<AxesSubplot:xlabel='Income', ylabel='Personal_Loan'>
Higher the income higher the chances of taking the personal loan
plt.figure(figsize=(20,8))
# Family-size distribution within each target class.
sns.boxplot(x="Personal_Loan", y="Family", data=loandf1); #Analysis with the target variable (Personal_Loan)
Most of the personal loans were taken by families having 2 or more members.
plt.figure(figsize=(20,8))
# Mean loan take-up rate per CCAvg value.
sns.barplot(x='CCAvg', y='Personal_Loan', data=loandf1) #Analysis with the target variable (Personal_Loan)
<AxesSubplot:xlabel='CCAvg', ylabel='Personal_Loan'>
Spending on credit card and chances of personal loan are positively correlated
plt.figure(figsize=(20,8))
# Education-level distribution within each target class.
sns.boxplot(x="Personal_Loan", y="Education", data=loandf1); #Analysis with the target variable (Personal_Loan)
Graduates and professionals make up the majority of personal-loan consumers.
plt.figure(figsize=(20,8))
# Mortgage distribution within each target class.
sns.boxplot(x="Personal_Loan", y="Mortgage", data=loandf1); #Analysis with the target variable (Personal_Loan)
More the mortgage, there are higher chances of consumer taking personal loan.
plt.figure(figsize=(20,8))
# Loan take-up rate by Securities_Account status.
sns.barplot(x='Securities_Account', y='Personal_Loan', data=loandf1) #Analysis with the target variable (Personal_Loan)
<AxesSubplot:xlabel='Securities_Account', ylabel='Personal_Loan'>
Customers having security account have higher chances of taking the personal loans.
plt.figure(figsize=(20,8))
# Loan take-up rate by CD_Account status.
sns.barplot(x='CD_Account', y='Personal_Loan', data=loandf1) #Analysis with the target variable (Personal_Loan)
<AxesSubplot:xlabel='CD_Account', ylabel='Personal_Loan'>
Customers having CD account have very high chances of taking the personal loans.
plt.figure(figsize=(20,8))
# Loan take-up rate by Online-banking status.
sns.barplot(x='Online', y='Personal_Loan', data=loandf1) #Analysis with the target variable (Personal_Loan)
<AxesSubplot:xlabel='Online', ylabel='Personal_Loan'>
There is not much difference between the chances of taking the personal loan for customers having internet banking facility or not.
plt.figure(figsize=(20,8))
# Loan take-up rate by CreditCard status.
sns.barplot(x='CreditCard', y='Personal_Loan', data=loandf1) #Analysis with the target variable (Personal_Loan)
<AxesSubplot:xlabel='CreditCard', ylabel='Personal_Loan'>
There is not much difference between the chances of taking the personal loan for customers having credit card or not.
plt.figure(figsize=(20,8))
# Pearson correlation heatmap of all columns (all are numeric), annotated with values.
sns.heatmap(loandf1.corr(),annot=True)
plt.show()
Univariate Assessments:
Bivariate assessment:
Find the number of records with experience less than 0 years
# Count records with negative Experience (invalid values flagged in the EDA above).
print('Experience',loandf1.query("Experience < 0")['Experience'].count()) #print the count
Experience 52
There are 52 records having experience less than 0; these need to be corrected.
# Work on a copy so loandf1 keeps the raw (uncorrected) values.
loandf2 = loandf1.copy()
# Clip negative experience up to 0 in one vectorized step. This corrects every
# possible negative value (the data contains -1, -2 and -3), instead of the
# original's three hard-coded replace() calls that would miss any other value.
loandf2['Experience'] = loandf2['Experience'].clip(lower=0)
The 52 records having values less than 0 were updated to 0.
# Verify the cleanup: the count of negative-Experience records should now be 0.
print('Experience',loandf2.query("Experience < 0")['Experience'].count()) #print the count
Experience 0
# One-hot encode the categorical columns. drop_first=True drops the first level
# of each variable so the dummies are not perfectly collinear (e.g. Family_1 is
# implied when Family_2/3/4 are all 0).
Loandum_data = pd.get_dummies(
loandf2,columns=[
"Family",
"Education",
"Securities_Account",
"CD_Account",
"Online",
"CreditCard",
],
drop_first=True,
)
Loandum_data.head()
| ID | Age | Experience | Income | ZIPCode | CCAvg | Mortgage | Personal_Loan | Family_2 | Family_3 | Family_4 | Education_2 | Education_3 | Securities_Account_1 | CD_Account_1 | Online_1 | CreditCard_1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91107 | 1.60000 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 2 | 45 | 19 | 34 | 90089 | 1.50000 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1.00000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 35 | 9 | 100 | 94112 | 2.70000 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | 35 | 8 | 45 | 91330 | 1.00000 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
loandflr = Loandum_data.copy() # copy so the dummy-encoded frame stays untouched
# (The redundant mid-file re-import of train_test_split was removed — it is
# already imported at the top of the file.)
X = loandflr.drop('Personal_Loan',axis=1) # Predictor features
Y = loandflr['Personal_Loan'] # Predicted class (1=True, 0=False)
# NOTE(review): ID and ZIPCode are left in as predictors. ID is a row
# identifier and cannot generalize to new customers — consider dropping both.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
# 1 is just any random seed number
x_train.head()
| ID | Age | Experience | Income | ZIPCode | CCAvg | Mortgage | Family_2 | Family_3 | Family_4 | Education_2 | Education_3 | Securities_Account_1 | CD_Account_1 | Online_1 | CreditCard_1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1334 | 1335 | 47 | 22 | 35 | 94304 | 1.30000 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4768 | 4769 | 38 | 14 | 39 | 93118 | 2.00000 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 65 | 66 | 59 | 35 | 131 | 91360 | 3.80000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
| 177 | 178 | 29 | 3 | 65 | 94132 | 1.80000 | 244 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| 4489 | 4490 | 39 | 13 | 21 | 95518 | 0.20000 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
# Sanity-check the split proportions (expected 70% train / 30% test).
print("{0:0.2f}% data is in training set".format((len(x_train)/len(loandflr.index)) * 100)) #print percentage
print("{0:0.2f}% data is in test set".format((len(x_test)/len(loandflr.index)) * 100))
70.00% data is in training set 30.00% data is in test set
# Fit a logistic-regression baseline (liblinear solver) on the training split
# and inspect the learned coefficients (column order follows X) and intercept.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
model = LogisticRegression(solver="liblinear") # Fit the model on train
model.fit(x_train, y_train)
y_predict = model.predict(x_test) #predict on test
coef_df = pd.DataFrame(model.coef_)
coef_df['intercept'] = model.intercept_
print(coef_df)
0 1 2 3 4 5 6 7 \
0 -0.00004 -0.00032 -0.00045 0.03609 -0.00006 0.00148 0.00086 -0.00021
8 9 10 11 12 13 14 15 intercept
0 0.00026 0.00018 0.00031 0.00032 0.00003 0.00034 0.00001 0.00001 -0.00000
# NOTE(review): model.score is mean ACCURACY. With ~90% of customers in class 0
# (see the class-balance printout below), 0.907 barely beats always predicting
# "no loan" — recall on the loan class is the metric that matters here.
model_score = model.score(x_test, y_test) #print score
print(model_score)
0.9073333333333333
The model will be further improved by using a decision tree.
The DecisionTreeClassifier will be used with the default 'gini' criterion for splitting.
# NOTE(review): this cell is an exact duplicate of the logistic-regression cell
# above (same model, same fit, same coefficient printout). It looks accidental
# and can safely be removed.
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver="liblinear") # Fit the model on train
model.fit(x_train, y_train)
y_predict = model.predict(x_test) #predict on test
coef_df = pd.DataFrame(model.coef_)
coef_df['intercept'] = model.intercept_
print(coef_df)
0 1 2 3 4 5 6 7 \
0 -0.00004 -0.00032 -0.00045 0.03609 -0.00006 0.00148 0.00086 -0.00021
8 9 10 11 12 13 14 15 intercept
0 0.00026 0.00018 0.00031 0.00032 0.00003 0.00034 0.00001 0.00001 -0.00000
# Rebuild X/y from the dummy-encoded frame and re-split for the tree models.
# NOTE(review): no stratify=y is passed, so the ~90/10 class split is only
# approximately preserved across train and test.
X = Loandum_data.drop("Personal_Loan", axis=1) # drop personal loan
y = Loandum_data["Personal_Loan"] # Target Variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # Splitting data into training and test set:
print(X_train.shape, X_test.shape)
(3500, 16) (1500, 16)
# Row counts of each split (3500 train / 1500 test).
print("Number of rows in train data =", X_train.shape[0]) #print no of rows
print("Number of rows in test data =", X_test.shape[0])
Number of rows in train data = 3500 Number of rows in test data = 1500
# Class balance per split — confirms the strong imbalance (~90% class 0).
print("Percentage of classes in training set:") #print the classes
print(y_train.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))
Percentage of classes in training set: 0 0.90543 1 0.09457 Name: Personal_Loan, dtype: float64 Percentage of classes in test set: 0 0.90067 1 0.09933 Name: Personal_Loan, dtype: float64
## Function to calculate recall score; model: classifier, predictors: independent variables, target: dependent variable
def get_recall_score(model, predictors, target):
    """Return the recall of `model`'s predictions on `predictors` against `target`."""
    prediction = model.predict(predictors)
    return recall_score(target, prediction)
# To plot the confusion matrix with counts and percentages; model: classifier,
# predictors: independent variables, target: dependent variable.
def confusion_matrix_sklearn(model, predictors, target):
    """Plot the confusion matrix of `model` on (`predictors`, `target`).

    Each cell is annotated with the raw count and its share of all samples.
    Generalized from the original: the label grid is reshaped to cm.shape,
    so it works for any number of classes instead of a hard-coded 2x2.
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    total = cm.sum()  # hoisted: original recomputed cm.flatten().sum() per cell
    labels = np.asarray(
        ["{0:0.0f}\n{1:.2%}".format(count, count / total) for count in cm.flatten()]
    ).reshape(cm.shape)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# Baseline decision tree: gini splits; class_weight up-weights the rare loan
# class (1) so recall on it drives the splits; random_state fixes tie-breaking.
model = DecisionTreeClassifier(
    criterion="gini", class_weight={0: 0.15, 1: 0.85}, random_state=1
)
model.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, random_state=1)
# Training-set performance (a fully-grown tree memorizes: recall = 1.0 here).
confusion_matrix_sklearn(model, X_train, y_train) #print confusion matrix
decision_tree_perf_train = get_recall_score(model, X_train, y_train) #print recall score
print("Recall Score:", decision_tree_perf_train)
Recall Score: 1.0
# Test-set performance — the train/test gap reveals overfitting.
confusion_matrix_sklearn(model, X_test, y_test) #Display confusion matrix
decision_tree_perf_test = get_recall_score(model, X_test, y_test) #print recall score
print("Recall Score:", decision_tree_perf_test)
Recall Score: 0.8657718120805369
There is a large gap between the recall score on training (1.0) and test (0.87) data — the model is overfitting.
feature_names = X_train.columns.to_list() #creating a list of column names
plt.figure(figsize=(20, 30)) # below code will add arrows to the decision tree split if they are missing
out = tree.plot_tree(
    model,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# plot_tree returns the drawn artists; darken/thicken each parent->child arrow
# so the splits remain readable at this figure size.
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
plt.show()
print(tree.export_text(model, feature_names=feature_names, show_weights=True)) # Text report showing the rules of a decision tree
|--- Income <= 98.50 | |--- CCAvg <= 2.95 | | |--- weights: [374.10, 0.00] class: 0 | |--- CCAvg > 2.95 | | |--- CD_Account_1 <= 0.50 | | | |--- CCAvg <= 3.95 | | | | |--- Income <= 81.50 | | | | | |--- Experience <= 12.50 | | | | | | |--- Family_4 <= 0.50 | | | | | | | |--- CCAvg <= 3.50 | | | | | | | | |--- Age <= 34.00 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- Age > 34.00 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- CCAvg > 3.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Family_4 > 0.50 | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | |--- Experience > 12.50 | | | | | | |--- ZIPCode <= 91269.00 | | | | | | | |--- ID <= 1184.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- ID > 1184.50 | | | | | | | | |--- weights: [1.05, 0.00] class: 0 | | | | | | |--- ZIPCode > 91269.00 | | | | | | | |--- Mortgage <= 54.00 | | | | | | | | |--- weights: [4.05, 0.00] class: 0 | | | | | | | |--- Mortgage > 54.00 | | | | | | | | |--- weights: [1.50, 0.00] class: 0 | | | | |--- Income > 81.50 | | | | | |--- ID <= 934.50 | | | | | | |--- weights: [1.35, 0.00] class: 0 | | | | | |--- ID > 934.50 | | | | | | |--- ZIPCode <= 95084.00 | | | | | | | |--- CCAvg <= 3.05 | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | | |--- CCAvg > 3.05 | | | | | | | | |--- Mortgage <= 173.00 | | | | | | | | | |--- ID <= 3334.00 | | | | | | | | | | |--- ID <= 1925.00 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | | |--- ID > 1925.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- ID > 3334.00 | | | | | | | | | | |--- weights: [0.00, 5.95] class: 1 | | | | | | | | |--- Mortgage > 173.00 | | | | | | | | | |--- Education_3 <= 0.50 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | |--- Education_3 > 0.50 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- ZIPCode > 95084.00 | 
| | | | | | |--- ZIPCode <= 95328.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- ZIPCode > 95328.50 | | | | | | | | |--- weights: [0.75, 0.00] class: 0 | | | |--- CCAvg > 3.95 | | | | |--- weights: [6.75, 0.00] class: 0 | | |--- CD_Account_1 > 0.50 | | | |--- ID <= 766.50 | | | | |--- weights: [0.15, 0.00] class: 0 | | | |--- ID > 766.50 | | | | |--- weights: [0.00, 6.80] class: 1 |--- Income > 98.50 | |--- Education_3 <= 0.50 | | |--- Education_2 <= 0.50 | | | |--- Family_3 <= 0.50 | | | | |--- Family_4 <= 0.50 | | | | | |--- Income <= 100.00 | | | | | | |--- ZIPCode <= 91169.00 | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | |--- ZIPCode > 91169.00 | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | |--- Income > 100.00 | | | | | | |--- Income <= 103.50 | | | | | | | |--- Securities_Account_1 <= 0.50 | | | | | | | | |--- weights: [2.10, 0.00] class: 0 | | | | | | | |--- Securities_Account_1 > 0.50 | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | |--- Income > 103.50 | | | | | | | |--- ZIPCode <= 90006.00 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- ZIPCode > 90006.00 | | | | | | | | |--- weights: [64.80, 0.00] class: 0 | | | | |--- Family_4 > 0.50 | | | | | |--- Income <= 102.00 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- Income > 102.00 | | | | | | |--- weights: [0.00, 16.15] class: 1 | | | |--- Family_3 > 0.50 | | | | |--- Income <= 108.50 | | | | | |--- weights: [1.05, 0.00] class: 0 | | | | |--- Income > 108.50 | | | | | |--- ZIPCode <= 90019.50 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- ZIPCode > 90019.50 | | | | | | |--- Age <= 26.00 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Age > 26.00 | | | | | | | |--- Income <= 118.00 | | | | | | | | |--- ID <= 2808.00 | | | | | | | 
| | |--- weights: [0.00, 1.70] class: 1 | | | | | | | | |--- ID > 2808.00 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- Income > 118.00 | | | | | | | | |--- weights: [0.00, 28.05] class: 1 | | |--- Education_2 > 0.50 | | | |--- Income <= 110.50 | | | | |--- CCAvg <= 3.54 | | | | | |--- Income <= 106.50 | | | | | | |--- ID <= 342.00 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- ID > 342.00 | | | | | | | |--- weights: [3.75, 0.00] class: 0 | | | | | |--- Income > 106.50 | | | | | | |--- Age <= 52.00 | | | | | | | |--- weights: [0.75, 0.00] class: 0 | | | | | | |--- Age > 52.00 | | | | | | | |--- CCAvg <= 1.85 | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | |--- CCAvg > 1.85 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | |--- CCAvg > 3.54 | | | | | |--- weights: [0.00, 2.55] class: 1 | | | |--- Income > 110.50 | | | | |--- Income <= 116.50 | | | | | |--- Mortgage <= 141.50 | | | | | | |--- Experience <= 35.50 | | | | | | | |--- CCAvg <= 1.20 | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | |--- CCAvg > 1.20 | | | | | | | | |--- ZIPCode <= 94887.00 | | | | | | | | | |--- CCAvg <= 2.65 | | | | | | | | | | |--- Income <= 113.50 | | | | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | | | | | |--- Income > 113.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- CCAvg > 2.65 | | | | | | | | | | |--- Age <= 31.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | | |--- Age > 31.50 | | | | | | | | | | | |--- weights: [0.00, 3.40] class: 1 | | | | | | | | |--- ZIPCode > 94887.00 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Experience > 35.50 | | | | | | | |--- Experience <= 38.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- Experience > 38.50 | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | |--- Mortgage > 141.50 | | | | | | |--- Income <= 112.50 | | 
| | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | |--- Income > 112.50 | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | |--- Income > 116.50 | | | | | |--- weights: [0.00, 91.80] class: 1 | |--- Education_3 > 0.50 | | |--- Income <= 116.50 | | | |--- CCAvg <= 2.45 | | | | |--- Age <= 41.50 | | | | | |--- weights: [3.60, 0.00] class: 0 | | | | |--- Age > 41.50 | | | | | |--- Experience <= 31.50 | | | | | | |--- Online_1 <= 0.50 | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | | |--- Online_1 > 0.50 | | | | | | | |--- ZIPCode <= 93596.00 | | | | | | | | |--- ZIPCode <= 91859.00 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- ZIPCode > 91859.00 | | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | | |--- ZIPCode > 93596.00 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- Experience > 31.50 | | | | | | |--- ID <= 639.00 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- ID > 639.00 | | | | | | | |--- weights: [1.35, 0.00] class: 0 | | | |--- CCAvg > 2.45 | | | | |--- ZIPCode <= 90389.50 | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- ZIPCode > 90389.50 | | | | | |--- ID <= 4852.50 | | | | | | |--- ID <= 4505.50 | | | | | | | |--- CD_Account_1 <= 0.50 | | | | | | | | |--- Income <= 99.50 | | | | | | | | | |--- CCAvg <= 4.00 | | | | | | | | | | |--- CCAvg <= 3.25 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | | |--- CCAvg > 3.25 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | |--- CCAvg > 4.00 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- Income > 99.50 | | | | | | | | | |--- weights: [0.00, 10.20] class: 1 | | | | | | | |--- CD_Account_1 > 0.50 | | | | | | | | |--- ZIPCode <= 93254.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- ZIPCode > 93254.50 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | |--- ID > 4505.50 | | | | | | | 
|--- ID <= 4731.50 | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | |--- ID > 4731.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | |--- ID > 4852.50 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | |--- Income > 116.50 | | | |--- CreditCard_1 <= 0.50 | | | | |--- weights: [0.00, 66.30] class: 1 | | | |--- CreditCard_1 > 0.50 | | | | |--- weights: [0.00, 30.60] class: 1
# Importance of features, sorted descending (impurity-based / Gini importance).
print(
    pd.DataFrame(
        model.feature_importances_, columns=["Imp"], index=X_train.columns
    ).sort_values(by="Imp", ascending=False)
)
Imp Income 0.59316 Education_2 0.08813 CCAvg 0.08163 Family_4 0.07179 Family_3 0.07032 Education_3 0.03515 ID 0.01435 CD_Account_1 0.01110 ZIPCode 0.01062 Experience 0.00916 Age 0.00569 Mortgage 0.00347 Securities_Account_1 0.00277 Online_1 0.00195 CreditCard_1 0.00072 Family_2 0.00000
# Horizontal bar chart of the baseline tree's feature importances,
# ordered so the most important feature appears at the top.
importances = model.feature_importances_
indices = np.argsort(importances)
n_feats = len(indices)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(n_feats), importances[indices], align="center", color="violet")
plt.yticks(range(n_feats), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Choose the type of classifier (same class weighting as the baseline tree).
estimator = DecisionTreeClassifier(random_state=1, class_weight={0: 0.15, 1: 0.85})
# Grid of parameters to choose from
parameters = {
    "max_depth": [5, 10, 15, None],
    "criterion": ["entropy", "gini"],
    "splitter": ["best", "random"],
    "min_impurity_decrease": [0.00001, 0.0001, 0.01],
}
# Type of scoring used to compare parameter combinations
# (equivalent to passing scoring="recall" directly to GridSearchCV)
scorer = make_scorer(recall_score)
# Run the grid search with 5-fold cross-validation on the training set
grid_obj = GridSearchCV(estimator, parameters, scoring=scorer, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this extra fit is redundant but harmless.
estimator.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, max_depth=5,
min_impurity_decrease=0.01, random_state=1)
# Training-set performance of the tuned (pruned) tree.
confusion_matrix_sklearn(estimator, X_train, y_train) #display confusion matrix
decision_tree_tune_perf_train = get_recall_score(estimator, X_train, y_train) #print recall score
print("Recall Score:", decision_tree_tune_perf_train)
Recall Score: 0.9909365558912386
plt.figure(figsize=(15, 10)) #display the decision tree (tuned estimator)
out = tree.plot_tree(
    estimator,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# Darken/thicken the parent->child arrows so the splits remain readable.
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
plt.show()
print(tree.export_text(estimator, feature_names=feature_names, show_weights=True)) # Text report showing the rules of a decision tree -
|--- Income <= 98.50 | |--- CCAvg <= 2.95 | | |--- weights: [374.10, 0.00] class: 0 | |--- CCAvg > 2.95 | | |--- weights: [18.60, 18.70] class: 1 |--- Income > 98.50 | |--- Education_3 <= 0.50 | | |--- Education_2 <= 0.50 | | | |--- Family_3 <= 0.50 | | | | |--- Family_4 <= 0.50 | | | | | |--- weights: [67.65, 2.55] class: 0 | | | | |--- Family_4 > 0.50 | | | | | |--- weights: [0.15, 16.15] class: 1 | | | |--- Family_3 > 0.50 | | | | |--- weights: [1.50, 29.75] class: 1 | | |--- Education_2 > 0.50 | | | |--- weights: [6.75, 101.15] class: 1 | |--- Education_3 > 0.50 | | |--- weights: [6.60, 113.05] class: 1
# importance of features in the tree building ( The importance of a feature is computed as the
# (normalized) total reduction of the 'criterion' brought by that feature. It is also known as the Gini importance )
print(
    pd.DataFrame(
        estimator.feature_importances_, columns=["Imp"], index=X_train.columns
    ).sort_values(by="Imp", ascending=False)
)
Imp Income 0.63668 Education_2 0.10328 Family_3 0.08241 Family_4 0.07991 CCAvg 0.05653 Education_3 0.04119 ID 0.00000 Age 0.00000 Experience 0.00000 ZIPCode 0.00000 Mortgage 0.00000 Family_2 0.00000 Securities_Account_1 0.00000 CD_Account_1 0.00000 Online_1 0.00000 CreditCard_1 0.00000
# Horizontal bar chart of the tuned tree's feature importances,
# most important feature at the top.
importances = estimator.feature_importances_
indices = np.argsort(importances)
n_feats = len(indices)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(n_feats), importances[indices], align="center", color="violet")
plt.yticks(range(n_feats), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Compute the cost-complexity pruning path for a class-weighted tree:
# the sequence of effective alphas and the total leaf impurity at each alpha.
clf = DecisionTreeClassifier(random_state=1, class_weight={0: 0.15, 1: 0.85})
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities
pd.DataFrame(path)  # notebook display of the alpha/impurity pairs
| ccp_alphas | impurities | |
|---|---|---|
| 0 | 0.00000 | -0.00000 |
| 1 | 0.00000 | -0.00000 |
| 2 | 0.00000 | -0.00000 |
| 3 | 0.00000 | -0.00000 |
| 4 | 0.00000 | -0.00000 |
| 5 | 0.00000 | -0.00000 |
| 6 | 0.00000 | -0.00000 |
| 7 | 0.00000 | -0.00000 |
| 8 | 0.00000 | -0.00000 |
| 9 | 0.00000 | -0.00000 |
| 10 | 0.00000 | -0.00000 |
| 11 | 0.00000 | -0.00000 |
| 12 | 0.00000 | -0.00000 |
| 13 | 0.00000 | -0.00000 |
| 14 | 0.00000 | -0.00000 |
| 15 | 0.00000 | -0.00000 |
| 16 | 0.00000 | -0.00000 |
| 17 | 0.00019 | 0.00077 |
| 18 | 0.00020 | 0.00117 |
| 19 | 0.00034 | 0.00151 |
| 20 | 0.00036 | 0.00296 |
| 21 | 0.00036 | 0.00333 |
| 22 | 0.00037 | 0.00443 |
| 23 | 0.00037 | 0.00481 |
| 24 | 0.00038 | 0.00519 |
| 25 | 0.00039 | 0.00557 |
| 26 | 0.00039 | 0.00635 |
| 27 | 0.00039 | 0.00674 |
| 28 | 0.00059 | 0.00733 |
| 29 | 0.00065 | 0.00799 |
| 30 | 0.00066 | 0.00864 |
| 31 | 0.00067 | 0.00931 |
| 32 | 0.00068 | 0.00999 |
| 33 | 0.00079 | 0.01235 |
| 34 | 0.00088 | 0.01323 |
| 35 | 0.00091 | 0.01414 |
| 36 | 0.00094 | 0.01508 |
| 37 | 0.00094 | 0.01696 |
| 38 | 0.00100 | 0.01895 |
| 39 | 0.00101 | 0.01997 |
| 40 | 0.00101 | 0.02098 |
| 41 | 0.00102 | 0.02200 |
| 42 | 0.00112 | 0.02311 |
| 43 | 0.00147 | 0.02458 |
| 44 | 0.00164 | 0.02622 |
| 45 | 0.00169 | 0.02959 |
| 46 | 0.00184 | 0.03144 |
| 47 | 0.00260 | 0.03404 |
| 48 | 0.00274 | 0.03678 |
| 49 | 0.00334 | 0.04012 |
| 50 | 0.00341 | 0.04353 |
| 51 | 0.00353 | 0.04706 |
| 52 | 0.00480 | 0.05665 |
| 53 | 0.00514 | 0.06179 |
| 54 | 0.00673 | 0.06851 |
| 55 | 0.02253 | 0.09105 |
| 56 | 0.03057 | 0.21334 |
| 57 | 0.25380 | 0.46714 |
# Total leaf impurity as a function of effective alpha; the last point
# (the trivial single-node tree) is dropped to keep the axis readable.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_title("Total Impurity vs effective alpha for training set")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
plt.show()
# Train one class-weighted tree for every candidate alpha on the pruning path
clfs = []
for alpha in ccp_alphas:
    clf = DecisionTreeClassifier(
        random_state=1, ccp_alpha=alpha, class_weight={0: 0.15, 1: 0.85}
    )
    clfs.append(clf.fit(X_train, y_train))  # fit() returns the estimator itself
msg = "Number of nodes in the last tree is: {} with ccp_alpha: {}"
print(msg.format(clfs[-1].tree_.node_count, ccp_alphas[-1]))
Number of nodes in the last tree is: 1 with ccp_alpha: 0.25379571489480923
# Drop the final alpha/tree pair: it prunes everything down to a single node
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

node_counts = [c.tree_.node_count for c in clfs]
depth = [c.tree_.max_depth for c in clfs]

# Model complexity (node count, depth) shrinks as alpha grows
fig, ax = plt.subplots(2, 1, figsize=(10, 7))
ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax[0].set_title("Number of nodes vs alpha")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
ax[1].set_title("Depth vs alpha")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
fig.tight_layout()
# Recall on the training and test sets for every candidate alpha
recall_train = []
recall_test = []
for clf in clfs:
    recall_train.append(recall_score(y_train, clf.predict(X_train)))
    recall_test.append(recall_score(y_test, clf.predict(X_test)))

fig, ax = plt.subplots(figsize=(15, 5))
ax.set_title("Recall vs alpha for training and testing sets")
ax.set_xlabel("alpha")
ax.set_ylabel("Recall")
ax.plot(ccp_alphas, recall_train, marker="o", label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, recall_test, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()
# creating the model where we get the highest TEST recall
# (np.argmax returns the first index of the maximal test recall)
index_best_model = np.argmax(recall_test)
best_model = clfs[index_best_model]
print(best_model)  # show the chosen tree's hyperparameters, incl. its ccp_alpha
DecisionTreeClassifier(ccp_alpha=0.0067258136904069215,
class_weight={0: 0.15, 1: 0.85}, random_state=1)
best_model.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0.0067258136904069215,
class_weight={0: 0.15, 1: 0.85}, random_state=1)
# Training-set performance of the pruned best model
confusion_matrix_sklearn(best_model, X_train, y_train)
best_model_train_recall = get_recall_score(best_model, X_train, y_train)
print("Recall Score:", best_model_train_recall)
Recall Score: 0.9909365558912386
# Test-set performance of the pruned best model
confusion_matrix_sklearn(best_model, X_test, y_test)
best_model_test_recall = get_recall_score(best_model, X_test, y_test)
print("Recall Score:", best_model_test_recall)
Recall Score: 0.9865771812080537
# Render the pruned best model
plt.figure(figsize=(5, 5))
out = tree.plot_tree(
    best_model,
    feature_names=feature_names,
    class_names=None,
    filled=True,
    node_ids=False,
    fontsize=9,
)
# Darken and thicken the branch arrows so the splits are easier to follow
for node_artist in out:
    arrow = node_artist.arrow_patch
    if arrow is None:
        continue
    arrow.set_edgecolor("black")
    arrow.set_linewidth(1)
plt.show()
# Text report showing the rules of the pruned decision tree, including
# per-class sample weights at each leaf
best_model_rules = tree.export_text(
    best_model, feature_names=feature_names, show_weights=True
)
print(best_model_rules)
|--- Income <= 98.50 | |--- CCAvg <= 2.95 | | |--- weights: [374.10, 0.00] class: 0 | |--- CCAvg > 2.95 | | |--- weights: [18.60, 18.70] class: 1 |--- Income > 98.50 | |--- Education_3 <= 0.50 | | |--- Education_2 <= 0.50 | | | |--- Family_3 <= 0.50 | | | | |--- Family_4 <= 0.50 | | | | | |--- weights: [67.65, 2.55] class: 0 | | | | |--- Family_4 > 0.50 | | | | | |--- weights: [0.15, 16.15] class: 1 | | | |--- Family_3 > 0.50 | | | | |--- weights: [1.50, 29.75] class: 1 | | |--- Education_2 > 0.50 | | | |--- weights: [6.75, 101.15] class: 1 | |--- Education_3 > 0.50 | | |--- weights: [6.60, 113.05] class: 1
According to the final decision tree model: